import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
import copy
# Toy dataset for the KNN hand-calculation exercise: three numeric
# predictors (X1-X3) and a colour label Y.
aa = pd.DataFrame(
    [[0, 3, 0, 'Red'], [2, 0, 0, 'Red'], [0, 1, 3, 'Red'],
     [0, 1, 2, 'Green'], [-1, 0, 1, 'Green'], [1, 1, 2, 'Red']],
    columns=['X1', 'X2', 'X3', 'Y'],
)
aa0 = aa[['X1', 'X2', 'X3']]  # predictors only
x = [0, 0, 0]  # query point to classify
# Euclidean distance from every observation to the query point x.
[math.dist(row, x) for _, row in aa0.iterrows()]
[3.0, 2.0, 3.1622776601683795, 2.23606797749979, 1.4142135623730951, 2.449489742783178]
aa.iloc[[1,3,4],3]
1 Red 3 Green 4 Green Name: Y, dtype: object
(a) Since there are more 'Green' labels in x=[0,0,0]'s neighbour, I think x=[0,0,0] should be labeled as 'Green'.
(b) If the Bayes decision boundary in this problem is highly nonlinear — i.e. the boundary is wiggly — we would expect the best value of K to be small. A small K yields a more flexible model that can follow a wiggly boundary; if K is too large, the fit is too smooth and incurs high bias. Moreover, with a larger dataset the extra variance introduced by a more flexible model is reduced. By the bias–variance trade-off, we should therefore prefer a smaller K.
# Load the pre-split diabetes datasets.
df_train = pd.read_csv('diabetes_train.csv')
df_test = pd.read_csv('diabetes_test.csv')

# First look at the training data: per-column ranges, means and
# suspicious values (e.g. zero minima in clinical measurements).
df_train.describe()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 |
| mean | 4.053738 | 124.752336 | 69.672897 | 20.072430 | 84.067757 | 32.549065 | 0.502308 | 34.329439 | 0.478972 |
| std | 3.538270 | 32.822486 | 19.135913 | 16.555687 | 124.157706 | 7.669440 | 0.347304 | 11.926841 | 0.500142 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 103.000000 | 64.000000 | 0.000000 | 0.000000 | 27.875000 | 0.253750 | 25.000000 | 0.000000 |
| 50% | 3.000000 | 123.000000 | 72.000000 | 22.500000 | 0.000000 | 32.500000 | 0.402500 | 31.000000 | 0.000000 |
| 75% | 7.000000 | 145.000000 | 80.000000 | 32.000000 | 130.000000 | 36.800000 | 0.675000 | 41.250000 | 1.000000 |
| max | 17.000000 | 199.000000 | 114.000000 | 99.000000 | 846.000000 | 59.400000 | 2.420000 | 81.000000 | 1.000000 |
df_train.isnull().sum()
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
[sum(df_train.iloc[:,i]==0) for i in range(8)]
[64, 3, 19, 139, 218, 6, 0, 0]
[sum(df_test.iloc[:,i]==0) for i in range(8)]
[20, 1, 9, 32, 51, 1, 0, 0]
# One box plot per column, each on its own y scale, to spot
# outliers and the spike of zero values.
df_train.plot(kind='box', subplots=True, sharey=False, figsize=(20, 5));
After computing descriptive statistics and plotting box plots, we found that 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' contain zero values, which do not make sense physiologically. Since zero is not a plausible measurement for any of these columns, we should treat those zeros as missing (NaN) values.
# In these clinical columns a value of 0 is physiologically impossible, so it
# encodes a missing measurement.  For each, create a *_updated copy with 0
# replaced by NaN, then drop the raw column.  NOTE: the order of this list
# matters — later cells index the new columns positionally (iloc columns 4-8).
zero_as_missing = ['SkinThickness', 'Insulin', 'BMI', 'Glucose', 'BloodPressure']
for df in (df_train, df_test):
    for col in zero_as_missing:
        df[col + '_updated'] = df[col].replace(0, np.nan)
df_train = df_train.drop(zero_as_missing, axis=1)
df_test = df_test.drop(zero_as_missing, axis=1)
df_train.isnull().sum()
Pregnancies 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 SkinThickness_updated 139 Insulin_updated 218 BMI_updated 6 Glucose_updated 3 BloodPressure_updated 19 dtype: int64
df_test.isna().sum()
Pregnancies 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 SkinThickness_updated 32 Insulin_updated 51 BMI_updated 1 Glucose_updated 1 BloodPressure_updated 9 dtype: int64
# Visualise missingness: the matrix shows per-row patterns of missing
# values, the bar chart shows non-missing counts per column.
import missingno as msno

msno.matrix(df_train, figsize=(12, 4));
msno.bar(df_train, figsize=(12, 4));
From the graphs we can conclude that 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' have missing values, and the problem is most severe in 'SkinThickness' and 'Insulin'. We therefore need to impute the missing values.
df_train.shape
(428, 9)
Since the dataset is not large, we should avoid dropping rows with missing values (dropna) and impute them instead.
# Re-draw the box plots on the *_updated columns to judge skewness and
# outliers, which informs the choice between mean and median imputation.
df_train.plot(kind='box', subplots=True, sharey=False, figsize=(20, 5));
From the plots it is clear that 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' have outliers. We therefore use median imputation for those columns (the median is robust to outliers) and mean imputation for 'Glucose'.
# Impute missing values: median for the outlier-heavy columns, mean for
# Glucose (roughly symmetric).  Named columns replace the fragile positional
# iloc indexing of the original.
# FIX(review): fill statistics are now computed on the TRAINING set only and
# reused for the test set.  Previously the test set was imputed with its own
# medians/means, which leaks test information and is inconsistent with the
# train-only StandardScaler fit used later in this notebook.
df_train_clean = copy.deepcopy(df_train)
df_test_clean = copy.deepcopy(df_test)

median_cols = ['SkinThickness_updated', 'Insulin_updated',
               'BMI_updated', 'BloodPressure_updated']
fill_values = {col: df_train_clean[col].median() for col in median_cols}
fill_values['Glucose_updated'] = df_train_clean['Glucose_updated'].mean()

for col, value in fill_values.items():
    df_train_clean[col] = df_train_clean[col].fillna(value)
    df_test_clean[col] = df_test_clean[col].fillna(value)
df_train_clean.isna().sum()
Pregnancies 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 SkinThickness_updated 0 Insulin_updated 0 BMI_updated 0 Glucose_updated 0 BloodPressure_updated 0 dtype: int64
df_test_clean.isna().sum()
Pregnancies 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 SkinThickness_updated 0 Insulin_updated 0 BMI_updated 0 Glucose_updated 0 BloodPressure_updated 0 dtype: int64
Now there are no missing values — the data is clean.
df_train_clean['Outcome'].value_counts()
0 223 1 205 Name: Outcome, dtype: int64
df_test_clean['Outcome'].value_counts()
1 63 0 45 Name: Outcome, dtype: int64
The class balance of 'Outcome' is similar in the training and test data, so we can proceed to EDA and KNN.
sns.pairplot(df_train_clean, diag_kind="kde", hue='Outcome', height=2);
# Per-class box plots: compare each predictor's distribution across the
# two Outcome groups, each panel on its own y scale.
df_train_clean.plot(kind='box', by='Outcome', subplots=True, sharey=False, figsize=(20, 5));
import plotly.express as px
# Interactive per-column box plots split by Outcome: melt to long format so
# every predictor gets its own facet, with independent y-axes (matches=None).
px.box(df_train_clean.melt(id_vars=['Outcome'], var_name = "col" ), x = "Outcome", y='value',
color = 'Outcome',facet_col='col').update_yaxes(matches=None)
We notice that the pairplots of 'Glucose' against 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' seem to separate the 'Outcome' classes well. Also, the distributions of 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age' and 'DiabetesPedigreeFunction' all differ slightly between the two 'Outcome' values, which could be investigated in more depth.
from sklearn.preprocessing import StandardScaler

# Split predictors from the target in both sets.
X_train = df_train_clean.drop(['Outcome'], axis=1)
y_train = df_train_clean['Outcome']
X_test = df_test_clean.drop(['Outcome'], axis=1)
y_test = df_test_clean['Outcome']

# Remember predictor names so the scaled arrays can be rebuilt as DataFrames.
col_names = X_train.columns

# Standardise: fit on the training set only, then apply to both
# (no information from the test set leaks into the scaling).
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=col_names)
X_test = pd.DataFrame(scaler.transform(X_test), columns=col_names)

# Sanity check: means ~0 and stds ~1 after scaling.
X_train.describe()
X_train.describe()
| Pregnancies | DiabetesPedigreeFunction | Age | SkinThickness_updated | Insulin_updated | BMI_updated | Glucose_updated | BloodPressure_updated | |
|---|---|---|---|---|---|---|---|---|
| count | 4.280000e+02 | 4.280000e+02 | 4.280000e+02 | 4.280000e+02 | 4.280000e+02 | 4.280000e+02 | 4.280000e+02 | 4.280000e+02 |
| mean | 2.490220e-17 | -6.640586e-17 | -7.678178e-17 | -1.400749e-16 | -6.640586e-17 | 2.116687e-16 | 2.365709e-16 | -4.233374e-16 |
| std | 1.001170e+00 | 1.001170e+00 | 1.001170e+00 | 1.001170e+00 | 1.001170e+00 | 1.001170e+00 | 1.001170e+00 | 1.001170e+00 |
| min | -1.147025e+00 | -1.223149e+00 | -1.118908e+00 | -2.555907e+00 | -1.506390e+00 | -2.241627e+00 | -2.627938e+00 | -3.632269e+00 |
| 25% | -8.640698e-01 | -7.165163e-01 | -7.831376e-01 | -3.434050e-01 | -2.025617e-01 | -7.313489e-01 | -6.964103e-01 | -6.728765e-01 |
| 50% | -2.981603e-01 | -2.877165e-01 | -2.794819e-01 | 2.067759e-02 | -2.025617e-01 | -6.894646e-02 | -8.475997e-02 | 8.811008e-02 |
| 75% | 8.336587e-01 | 4.978159e-01 | 5.809300e-01 | 2.447284e-01 | -2.025617e-01 | 5.745302e-01 | 6.234667e-01 | 5.954345e-01 |
| max | 3.663206e+00 | 5.528106e+00 | 3.917649e+00 | 7.750431e+00 | 7.571310e+00 | 3.996312e+00 | 2.361841e+00 | 3.470273e+00 |
# Sweep K from 1 to 30 and record train/test misclassification rates
# on the mean/median-imputed data.
k_range = np.arange(1, 31)
train_error = []
test_error = []

for k in k_range:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    # Error rate = fraction of misclassified observations.
    train_error.append(np.mean(model.predict(X_train) != y_train))
    test_error.append(np.mean(model.predict(X_test) != y_test))

plt.plot(k_range, train_error, label='Training Error')
plt.plot(k_range, test_error, label='Test Error')
plt.legend()
plt.title('Training and test error rate for KNN(mean and median imputation)')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()
test_error.index(min(test_error))
2
min(test_error)
0.21296296296296297
As a result, the best K value is 3: the minimum test error occurs at index 2, and since k_range starts at K=1, index 2 corresponds to K=3.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Refit KNN at the chosen K=3 and inspect the training confusion matrix.
knn = KNeighborsClassifier(n_neighbors=3)
# fit the model on the training set
knn.fit(X_train, y_train);
pred_train = knn.predict(X_train)
# make predictions on test data
pred_test = knn.predict(X_test)

cm_train = confusion_matrix(y_train, pred_train)
# FIX: Outcome takes values 0 and 1; the previous labels ['High', 'Low'] did
# not correspond to these classes.  Labels below assume the usual coding for
# this dataset (0 = no diabetes, 1 = diabetes) — confirm against the source.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm_train,
                                    display_labels=['No Diabetes', 'Diabetes'])
cm_display.plot()
plt.show()
From the confusion matrix, we can see that the predictions are correct in most cases on the training set.
# MICE-style imputation via sklearn's IterativeImputer, as an alternative
# to the simple mean/median fills above.
df_train_clean0 = copy.deepcopy(df_train)
df_test_clean0 = copy.deepcopy(df_test)

X_train_mice = df_train_clean0.drop(['Outcome'], axis=1)
y_train_mice = df_train_clean0['Outcome']
X_test_mice = df_test_clean0.drop(['Outcome'], axis=1)
y_test_mice = df_test_clean0['Outcome']
col_names = X_train_mice.columns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=10, random_state=42)
X_train_mice = imp.fit_transform(X_train_mice)
# FIX: transform (not fit_transform) the test set, so the imputation model
# learned on the training data is reused instead of being refit on test data.
X_test_mice = imp.transform(X_test_mice)
X_train_mice = pd.DataFrame(X_train_mice, columns=col_names)
X_test_mice = pd.DataFrame(X_test_mice, columns=col_names)

# Standardise with a scaler fitted on the MICE-imputed training data.
scaler1 = StandardScaler()
scaler1.fit(X_train_mice)
# FIX: the original fitted scaler1 but then applied the old `scaler`
# (fitted on the mean/median-imputed data); scaler1 must be used here.
X_train_mice = pd.DataFrame(scaler1.transform(X_train_mice), columns=col_names)
X_test_mice = pd.DataFrame(scaler1.transform(X_test_mice), columns=col_names)
# Repeat the K sweep (1..30) on the MICE-imputed data.
k_range = np.arange(1, 31)
train_error = []
test_error = []
for k in k_range:
    # setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    # FIX: fit against y_train_mice (the MICE pipeline's target), not
    # y_train, for consistency with the error computations below.
    # (The two hold identical values here, so results are unchanged.)
    knn.fit(X_train_mice, y_train_mice)
    # train error rate
    pred_train = knn.predict(X_train_mice)
    train_error.append(np.mean(pred_train != y_train_mice))
    # test error rate
    pred_test = knn.predict(X_test_mice)
    test_error.append(np.mean(pred_test != y_test_mice))
plt.plot(k_range, train_error, label='Training Error')
plt.plot(k_range, test_error, label='Test Error')
plt.legend()
plt.title('Training and test error rate for KNN(MICE)')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()
test_error.index(min(test_error))
2
min(test_error)
0.25
For MICE, the best K value is still 3 (the test error is minimised at index 2 of k_range, i.e. K=3).
In conclusion, K=3 is an appropriate value for the KNN model.